/*
 * kmp_barrier.cpp
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_wait_release.h"
#include "kmp_stats.h"
#include "kmp_itt.h"
#include "kmp_os.h"


#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src)            __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src)   _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync()               __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src)            ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src)   KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync()               ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */
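
/* Note on the NGO ("non-globally-ordered") path above: on KMP_MIC,
   _mm512_storenrngo_pd is a streaming 64-byte store that is not globally
   ordered with respect to other stores, so ngo_sync() must fence (here via a
   locked add to the stack) before a dependent flag write may be observed by
   other threads.  On all other targets the macros reduce to plain copies and
   the sync is a no-op. */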

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------
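
/* Four gather/release algorithm pairs are implemented below: linear, tree,
   hypercube ("hyper"), and hierarchical.  __kmp_barrier() selects a pair per
   barrier type via the __kmp_barrier_gather_pattern[] table (see the switch
   near the bottom of this file).  All pairs follow the same protocol: each
   worker bumps a b_arrived flag that a parent waits on (gather), and the
   parent then bumps each child's b_go flag (release). */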

// Linear Barrier
static void
__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            void (*reduce)(void *, void *)
                            USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_linear_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // We now perform a linear reduction to signal that all of the threads have arrived.
    if (!KMP_MASTER_TID(tid)) {
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
                      thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
        // Mark arrival to master thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
        flag.release();
    } else {
        register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
        register int nproc = this_thr->th.th_team_nproc;
        register int i;
        // Don't have to worry about sleep bit here or atomic since team setting
        register kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

        // Collect all the worker team member threads.
        for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (i+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                            __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                            &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

            // Wait for worker thread to arrive
            kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and the other thread time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                          other_threads[i]->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
                               team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
                (*reduce)(this_thr->th.th_local.reduce_data,
                          other_threads[i]->th.th_local.reduce_data);
            }
        }
        // Don't have to worry about sleep bit here or atomic since team setting
        team_bar->b_arrived = new_state;
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
    }
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                             int propagate_icvs
                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_linear_release);
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_team_t *team;

    if (KMP_MASTER_TID(tid)) {
        register unsigned int i;
        register kmp_uint32 nproc = this_thr->th.th_team_nproc;
        register kmp_info_t **other_threads;

        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        other_threads = team->t.t_threads;

        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));

        if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
                    for (i=1; i<nproc; ++i) {
                        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
                        ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                                       &team->t.t_implicit_task_taskdata[0].td_icvs);
                    }
                    ngo_sync();
                }
            }
#endif // KMP_BARRIER_ICV_PUSH

            // Now, release all of the worker threads
            for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
                // Prefetch next thread's go flag
                if (i+1 < nproc)
                    KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
                KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
                              &other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
                kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
                flag.release();
            }
        }
    } else { // Wait for the MASTER thread to release us
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
            return;
        // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
        tid = __kmp_tid_from_gtid(gtid);
        team = __kmp_threads[gtid]->th.th_team;
#endif
        KMP_DEBUG_ASSERT(team != NULL);
        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB();  // Flush all pending memory write invalidates.
    }
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// Tree barrier
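/* Indexing sketch: with branch_bits B (branch_factor = 1<<B), thread tid's
   children are tids (tid<<B)+1 .. (tid<<B)+branch_factor (bounded by nproc),
   and a worker's parent is (tid-1)>>B.  E.g., for B=2, thread 0 gathers from
   threads 1..4, thread 1 from threads 5..8, and so on. */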
static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                          void (*reduce)(void *, void *)
                          USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_tree_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint64 new_state;

    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
    child_tid = (tid << branch_bits) + 1;
    if (child_tid < nproc) {
        // Parent threads wait for all their children to arrive
        new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        child = 1;
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                            &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 flag(&child_bar->b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                          child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
            }
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }

    if (!KMP_MASTER_TID(tid)) { // Worker threads
        register kmp_int32 parent_tid = (tid - 1) >> branch_bits;

        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived,
                      thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

        // Mark arrival to parent thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time.  */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
        flag.release();
    } else {
        // Need to update the team arrived pointer if we are the master thread
        if (nproc > 1) // New value was already computed above
            team->t.t_bar[bt].b_arrived = new_state;
        else
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           int propagate_icvs
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_tree_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;

    // Perform a tree release for all of the threads that have been gathered
    if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB();  // Flush all pending memory write invalidates.
    } else {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    nproc = this_thr->th.th_team_nproc;
    child_tid = (tid << branch_bits) + 1;

    if (child_tid < nproc) {
        register kmp_info_t **other_threads = team->t.t_threads;
        child = 1;
        // Parent threads release all their children
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's go count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
                                             team, child_tid, FALSE);
                    copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                              &team->t.t_implicit_task_taskdata[0].td_icvs);
                }
            }
#endif // KMP_BARRIER_ICV_PUSH
            KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                          "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                          child_tid, &child_bar->b_go, child_bar->b_go,
                          child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child from barrier
            kmp_flag_64 flag(&child_bar->b_go, child_thr);
            flag.release();
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}


// Hyper Barrier
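/* Indexing sketch: at level L, a thread whose branch_bits starting at bit L
   are all zero acts as a parent; otherwise it signals the parent obtained by
   clearing bits 0..L+branch_bits-1 of its tid and drops out.  E.g., with
   branch_bits=1, odd tids signal tid-1 at level 0, tids congruent to 2 mod 4
   signal tid-2 at level 1, etc., so the gather finishes in roughly
   log2(nproc) rounds. */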
static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           void (*reduce)(void *, void *)
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
    register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    /* Perform a hypercube-embedded tree gather to wait until all of the threads have
       arrived, and reduce any required data as we go.  */
    kmp_flag_64 p_flag(&thr_bar->b_arrived);
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
    {
        register kmp_uint32 child;
        register kmp_uint32 child_tid;

        if (((tid >> level) & (branch_factor - 1)) != 0) {
            register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);

            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                          "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                          &thr_bar->b_arrived, thr_bar->b_arrived,
                          thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
            // Mark arrival to parent thread
            /* After performing this write (in the last iteration of the enclosing for loop),
               a worker thread may not assume that the team is valid any more - it could be
               deallocated by the master thread at any time.  */
            p_flag.set_waiter(other_threads[parent_tid]);
            p_flag.release();
            break;
        }

        // Parent threads wait for children to arrive
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1 << level))
        {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            register kmp_uint32 next_child_tid = child_tid + (1 << level);
            // Prefetch next thread's arrived count
            if (child+1 < branch_factor && next_child_tid < num_threads)
                KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
            c_flag.wait(this_thr, FALSE
                        USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                          child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
            }
        }
    }

    if (KMP_MASTER_TID(tid)) {
        // Need to update the team arrived pointer if we are the master thread
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        else
            team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// The reverse versions seem to beat the forward versions overall
#define KMP_REVERSE_HYPER_BAR
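/* In the reverse variant, the release tree is walked from the top level down
   and, within each level, from the highest-numbered child to the lowest, so
   the most distant subtrees are released first. */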
static void
__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            int propagate_icvs
                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads;
    register kmp_uint32 num_threads;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
       If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
       order of the corresponding gather, otherwise threads are released in the same order. */
    if (KMP_MASTER_TID(tid)) { // master
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) { // master already has ICVs in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
#endif
    }
    else  { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB();  // Flush all pending memory write invalidates.
    }
    num_threads = this_thr->th.th_team_nproc;
    other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
    // Count up to correct level for parent
    for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
         level+=branch_bits, offset<<=branch_bits);

    // Now go down from there
    for (level-=branch_bits, offset>>=branch_bits; offset != 0;
         level-=branch_bits, offset>>=branch_bits)
#else
    // Go down the tree, level by level
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
    {
#ifdef KMP_REVERSE_HYPER_BAR
        /* Now go in reverse order through the children, highest to lowest.
           Initial setting of child is conservative here. */
        child = num_threads >> ((level==0)?level:level-1);
        for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
             child>=1; child--, child_tid-=(1<<level))
#else
        if (((tid >> level) & (branch_factor - 1)) != 0)
            // No need to go lower than this, since this is the level parent would be notified
            break;
        // Iterate through children on this level of the tree
        for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1<<level))
#endif // KMP_REVERSE_HYPER_BAR
        {
            if (child_tid >= num_threads) continue;  // Child doesn't exist so keep going
            else {
                register kmp_info_t *child_thr = other_threads[child_tid];
                register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
                register kmp_uint32 next_child_tid = child_tid - (1 << level);
                // Prefetch next thread's go count
# ifdef KMP_REVERSE_HYPER_BAR
                if (child-1 >= 1 && next_child_tid < num_threads)
# else
                if (child+1 < branch_factor && next_child_tid < num_threads)
# endif // KMP_REVERSE_HYPER_BAR
                    KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
                if (propagate_icvs) // push my fixed ICVs to my child
                    copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

                KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                              child_tid, &child_bar->b_go, child_bar->b_go,
                              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                // Release child from barrier
                kmp_flag_64 flag(&child_bar->b_go, child_thr);
                flag.release();
            }
        }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
    }
#endif
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// Hierarchical Barrier

// Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread.  Performs the
   minimum amount of initialization required based on how the team has changed.  Returns true if
   leaf children will require both on-core and traditional wake-up mechanisms.  For example, if the
   team size increases, threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on their local b_go. */
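/* Illustrative layout (assuming, e.g., a machine hierarchy with 8 threads
   per core): skip_per_level = {1, 8, 64, ...}.  A worker at level d has as
   parent its tid rounded down to a multiple of skip_per_level[d+1], and a
   parent packs one byte per leaf child into its 64-bit leaf_state word;
   offset = 7-(tid-parent_tid-1) below selects the child's byte within the
   parent's flag. */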
static bool
__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
                                       int gtid, int tid, kmp_team_t *team)
{
    // Checks to determine if (re-)initialization is needed
    bool uninitialized = thr_bar->team == NULL;
    bool team_changed = team != thr_bar->team;
    bool team_sz_changed = nproc != thr_bar->nproc;
    bool tid_changed = tid != thr_bar->old_tid;
    bool retval = false;

    if (uninitialized || team_sz_changed) {
        __kmp_get_hierarchy(nproc, thr_bar);
    }

    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->my_level = thr_bar->depth-1; // default for master
        thr_bar->parent_tid = -1; // default for master
        if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
            kmp_uint32 d=0;
            while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
                kmp_uint32 rem;
                if (d == thr_bar->depth-2) { // reached level right below the master
                    thr_bar->parent_tid = 0;
                    thr_bar->my_level = d;
                    break;
                }
                else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
                    // thread is not a subtree root at next level, so this is max
                    thr_bar->parent_tid = tid - rem;
                    thr_bar->my_level = d;
                    break;
                }
                ++d;
            }
        }
        thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
        thr_bar->old_tid = tid;
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    }
    if (uninitialized || team_changed || tid_changed) {
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
        retval = true;
    }
    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->nproc = nproc;
        thr_bar->leaf_kids = thr_bar->base_leaf_kids;
        if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
        if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
            thr_bar->leaf_kids = nproc - tid - 1;
        thr_bar->leaf_state = 0;
        for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
    }
    return retval;
}

static void
__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
                                  int gtid, int tid, void (*reduce) (void *, void *)
                                  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hier_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state;

    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (other_threads[0]->th.th_teams_microtask)    // are we inside the teams construct?
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
#endif
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
    }
#endif

    (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);

    if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
        register kmp_int32 child_tid;
        new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
                kmp_uint64 leaf_state = KMP_MASTER_TID(tid)
                    ? thr_bar->b_arrived | thr_bar->leaf_state
                    : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
                KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting for leaf kids\n",
                              gtid, team->t.t_id, tid));
                kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
                flag.wait(this_thr, FALSE
                          USE_ITT_BUILD_ARG(itt_sync_obj) );
                if (reduce) {
                    for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
                    }
                }
                (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
            }
            // Next, wait for higher level children on each child's b_arrived flag
            for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %llu\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                    }
                }
            }
        }
        else { // Blocktime is not infinite
            for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %llu\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                    }
                }
            }
        }
    }
    // All subordinates are gathered; now release parent if not master thread

    if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
        /* Mark arrival to parent: After performing this write, a worker thread may not assume that
           the team is valid any more - it could be deallocated by the master thread at any time. */
        if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
            kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
            flag.release();
        }
        else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
            thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
            flag.set_waiter(other_threads[thr_bar->parent_tid]);
            flag.release();
        }
    } else { // Master thread needs to update the team's b_arrived value
        team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    // Is the team access below unsafe or just technically invalid?
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                                   int propagate_icvs
                                   USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hier_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    bool team_change = false; // indicates on-core barrier shouldn't be used

    if (KMP_MASTER_TID(tid)) {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    else { // Worker threads
        // Wait for parent thread to release me
        if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || thr_bar->my_level != 0 || thr_bar->team == NULL) {
            // Use traditional method of waiting on my own b_go flag
            thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
            kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
            flag.wait(this_thr, TRUE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
        }
        else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
            // Wait on my "offset" bits on parent's b_go flag
            thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
                                 bt, this_thr
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
            flag.wait(this_thr, TRUE);
            if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
                TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            }
            else { // Reset my bits on parent's b_go flag
                ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
            }
        }
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;
        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB();  // Flush all pending memory write invalidates.
    }

    nproc = this_thr->th.th_team_nproc;
    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (team->t.t_threads[0]->th.th_teams_microtask) { // are we inside the teams construct?
        if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
            ++level; // level was not increased in teams construct for team_of_workers
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
    }
#endif
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    // If the team size has increased, we still communicate with old leaves via oncore barrier.
    unsigned short int old_leaf_kids = thr_bar->leaf_kids;
    kmp_uint64 old_leaf_state = thr_bar->leaf_state;
    team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
    // But if the entire team changes, we won't use oncore barrier at all
    if (team_change) old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
        else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
            if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
                // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
            // non-leaves will get ICVs piggybacked with b_go via NGO store
        }
        else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
            if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
                copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
            else // leaves copy parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PUSH

    // Now, release my children
    if (thr_bar->my_level) { // not a leaf
        register kmp_int32 child_tid;
        kmp_uint32 last;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (KMP_MASTER_TID(tid)) { // do a flat release
                // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
                thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
                // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
                ngo_load(&thr_bar->th_fixed_icvs);
                // This loops over all the threads skipping only the leaf nodes in the hierarchy
                for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
                    register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Use ngo store (if available) to both store ICVs and release child via child's b_go
                    ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
                }
                ngo_sync();
            }
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            // Now, release leaf children
            if (thr_bar->leaf_kids) { // if there are any
                // We test team_change on the off-chance that the level 1 team changed.
                if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
                    if (old_leaf_kids) { // release old leaf kids
                        thr_bar->b_go |= old_leaf_state;
                    }
                    // Release new leaf kids
                    last = tid+thr_bar->skip_per_level[1];
                    if (last > nproc) last = nproc;
                    for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
                        register kmp_info_t   *child_thr = team->t.t_threads[child_tid];
                        register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                                      " T#%d(%d:%d) go(%p): %u => %u\n",
                                      gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                      team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                      child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                        // Release child using child's b_go flag
                        kmp_flag_64 flag(&child_bar->b_go, child_thr);
                        flag.release();
                    }
                }
                else { // Release all children at once with leaf_state bits on my own b_go flag
                    thr_bar->b_go |= thr_bar->leaf_state;
                }
            }
        }
        else { // Blocktime is not infinite; do a simple hierarchical release
            for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
                last = tid+thr_bar->skip_per_level[d+1];
                kmp_uint32 skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t   *child_thr = team->t.t_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Release child using child's b_go flag
                    kmp_flag_64 flag(&child_bar->b_go, child_thr);
                    flag.release();
                }
            }
        }
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
    }
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// ---------------------------- End of Barrier Algorithms ----------------------------

// Internal function to do a barrier.
/* If is_split is true, do a split barrier; otherwise, do a plain barrier.
   If reduce is non-NULL, do a split reduction barrier; otherwise no reduction is performed.
   Returns 0 if master thread, 1 if worker thread.  */
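/* For reference (illustrative call site): the compiler-facing entry point
   __kmpc_barrier() invokes this as
       __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
   fork/join barriers go through the separate __kmp_join_barrier()/
   __kmp_fork_barrier() paths instead. */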
int
__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
              void *reduce_data, void (*reduce)(void *, void *))
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_barrier);
    register int tid = __kmp_tid_from_gtid(gtid);
    register kmp_info_t *this_thr = __kmp_threads[gtid];
    register kmp_team_t *team = this_thr->th.th_team;
    register int status = 0;
    ident_t *loc = __kmp_threads[gtid]->th.th_ident;
#if OMPT_SUPPORT
    ompt_task_id_t my_task_id;
    ompt_parallel_id_t my_parallel_id;
#endif

    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

#if OMPT_SUPPORT
    if (ompt_enabled) {
#if OMPT_BLAME
        my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
        my_parallel_id = team->t.ompt_team_info.parallel_id;

#if OMPT_TRACE
        if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
            if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
                ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
                    my_parallel_id, my_task_id);
            }
        }
#endif
        if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
            ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
                my_parallel_id, my_task_id);
        }
#endif
        // It is OK to report the barrier state after the barrier begin callback.
        // According to the OMPT specification, a compliant implementation may
        // even delay reporting this state until the barrier begins to wait.
        this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
    }
#endif

    if (! team->t.t_serialized) {
#if USE_ITT_BUILD
        // This value will be used in itt notify events below.
        void *itt_sync_obj = NULL;
# if USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
# endif
#endif /* USE_ITT_BUILD */
        if (__kmp_tasking_mode == tskm_extra_barrier) {
            __kmp_tasking_barrier(team, this_thr, gtid);
            KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
                          gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
        }

        /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
           the team struct is not guaranteed to exist. */
        // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
        }

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
        // Let the debugger know: the thread has arrived at the barrier and is waiting.
        if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
            team->t.t_bar[bt].b_master_arrived += 1;
        } else {
            this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
        } // if
#endif /* USE_DEBUGGER */
        if (reduce != NULL) {
            //KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
            this_thr->th.th_local.reduce_data = reduce_data;
        }

        if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
            __kmp_task_team_setup(this_thr, team, 0); // use 0 to only setup the current team if nthreads > 1

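        /* Gather phase: the algorithm is chosen per barrier type via
           __kmp_barrier_gather_pattern; any unrecognized pattern falls back to
           the linear gather. */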
        switch (__kmp_barrier_gather_pattern[bt]) {
        case bp_hyper_bar: {
            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
            __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
                                       USE_ITT_BUILD_ARG(itt_sync_obj) );
            break;
        }
        case bp_hierarchical_bar: {
            __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
                                              USE_ITT_BUILD_ARG(itt_sync_obj));
            break;
        }
        case bp_tree_bar: {
            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
            __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
                                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            break;
        }
        default: {
            __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
                                        USE_ITT_BUILD_ARG(itt_sync_obj) );
        }
        }

        KMP_MB();

        if (KMP_MASTER_TID(tid)) {
            status = 0;
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_wait(this_thr, team
                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
            }
#if USE_DEBUGGER
            // Let the debugger know: all threads have arrived and are starting to leave the barrier.
            team->t.t_bar[bt].b_team_arrived += 1;
#endif

#if USE_ITT_BUILD
            /* TODO: In case of split reduction barrier, master thread may send acquired event early,
               before the final summation into the shared variable is done (final summation can be a
               long operation for array reductions).  */
            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier - report frame end (only if active_level == 1)
            if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
#if OMP_40_ENABLED
                this_thr->th.th_teams_microtask == NULL &&
#endif
                team->t.t_active_level == 1)
            {
                kmp_uint64 cur_time = __itt_get_timestamp();
                kmp_info_t **other_threads = team->t.t_threads;
                int nproc = this_thr->th.th_team_nproc;
                int i;
                switch(__kmp_forkjoin_frames_mode) {
                case 1:
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                    this_thr->th.th_frame_time = cur_time;
                    break;
                case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
                    break;
                case 3:
                    if( __itt_metadata_add_ptr ) {
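                        /* The imbalance value reported to ITT is the team's total wait
                           time at this barrier: delta = sum over all threads i of
                           (cur_time - th_bar_arrive_time[i]). */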
                        // Initialize with master's wait time
                        kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
                        for (i=1; i<nproc; ++i) {
                            delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
                        }
                        __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
                    }
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                    this_thr->th.th_frame_time = cur_time;
                    break;
                }
            }
#endif /* USE_ITT_BUILD */
        } else {
            status = 1;
#if USE_ITT_BUILD
            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
        }
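        /* Release phase: workers (status == 1) always go through the release
           algorithm; the master does so only for a non-split barrier. For a
           split barrier, the master's release is deferred until
           __kmp_end_split_barrier() is called. */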
        if (status == 1 || ! is_split) {
            switch (__kmp_barrier_release_pattern[bt]) {
            case bp_hyper_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
                                            USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            case bp_hierarchical_bar: {
                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
                                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            case bp_tree_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            default: {
                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
                                             USE_ITT_BUILD_ARG(itt_sync_obj) );
            }
            }
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_sync(this_thr, team);
            }
        }

#if USE_ITT_BUILD
        /* GEH: TODO: Move this under if-condition above and also include in
           __kmp_end_split_barrier(). This will more accurately represent the actual release time
           of the threads for split barriers.  */
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    } else { // Team is serialized.
        status = 0;
        if (__kmp_tasking_mode != tskm_immediate_exec) {
#if OMP_41_ENABLED
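            /* A serialized team can still own a task team if proxy tasks were
               found; wait for those tasks, then set the task team up again for
               the next region. */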
            if ( this_thr->th.th_task_team != NULL ) {
                void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
                if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
                    itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
                    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
                }
#endif

                KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE);
                __kmp_task_team_wait(this_thr, team
                                               USE_ITT_BUILD_ARG(itt_sync_obj));
                __kmp_task_team_setup(this_thr, team, 0);

#if USE_ITT_BUILD
                if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                    __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
            }
#else
            // The task team should be NULL for serialized code (tasks will be executed immediately)
            KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
            KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
#endif
        }
    }
    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT
    if (ompt_enabled) {
#if OMPT_BLAME
        if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
            ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
                my_parallel_id, my_task_id);
        }
#endif
        this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    }
#endif

    return status;
}


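/* Finish a split barrier: in __kmp_barrier() with is_split=TRUE the master skips
   the release phase, leaving the workers waiting. Once the intervening work
   (e.g. the final reduction) is done, the master calls this routine to run the
   release algorithm and let the workers go. */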
void
__kmp_end_split_barrier(enum barrier_type bt, int gtid)
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_end_split_barrier);
    int tid = __kmp_tid_from_gtid(gtid);
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *team = this_thr->th.th_team;

    if (!team->t.t_serialized) {
        if (KMP_MASTER_GTID(gtid)) {
            switch (__kmp_barrier_release_pattern[bt]) {
            case bp_hyper_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
                                            USE_ITT_BUILD_ARG(NULL) );
                break;
            }
            case bp_hierarchical_bar: {
                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
                                                   USE_ITT_BUILD_ARG(NULL));
                break;
            }
            case bp_tree_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
                                           USE_ITT_BUILD_ARG(NULL) );
                break;
            }
            default: {
                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
                                             USE_ITT_BUILD_ARG(NULL) );
            }
            }
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_sync(this_thr, team);
            } // if
        }
    }
}


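/* The join barrier at the end of a parallel region. All threads gather; once a
   worker has marked its arrival it must not touch the team structure again,
   since the master may deallocate the team at any time after that. */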
void
__kmp_join_barrier(int gtid)
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier);
    register kmp_info_t *this_thr = __kmp_threads[gtid];
    register kmp_team_t *team;
    register kmp_uint nproc;
    kmp_info_t *master_thread;
    int tid;
#ifdef KMP_DEBUG
    int team_id;
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
    void *itt_sync_obj = NULL;
# if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
        // Get object created at fork_barrier
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
# endif
#endif /* USE_ITT_BUILD */
    KMP_MB();

    // Get current info
    team = this_thr->th.th_team;
    nproc = this_thr->th.th_team_nproc;
    KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
    tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
    team_id = team->t.t_id;
#endif /* KMP_DEBUG */
    master_thread = this_thr->th.th_team_master;
#ifdef KMP_DEBUG
    if (master_thread != team->t.t_threads[0]) {
        __kmp_print_structure();
    }
#endif /* KMP_DEBUG */
    KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
    KMP_MB();

    // Verify state
    KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
    KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
    KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
    KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));

#if OMPT_SUPPORT 
#if OMPT_TRACE
    if (ompt_enabled &&
        ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
        ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
            team->t.ompt_team_info.parallel_id,
            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
    }
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
#endif

    if (__kmp_tasking_mode == tskm_extra_barrier) {
        __kmp_tasking_barrier(team, this_thr, gtid);
        KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
    }
# ifdef KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
        KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
                       __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
                       this_thr->th.th_task_team));
        KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
    }
# endif /* KMP_DEBUG */

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
       team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
       down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
       since the values are not used by __kmp_wait_template() in that case. */
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
        this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
        this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
    }

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
    case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
        __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                          USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
        __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                  USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    default: {
        __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
    }
    }

    /* From this point on, the team data structure may be deallocated at any time by the
       master thread - it is unsafe to reference it in any of the worker threads. Any per-team
       data items that need to be referenced before the end of the barrier should be moved to
       the kmp_task_team_t structs.  */
    if (KMP_MASTER_TID(tid)) {
        if (__kmp_tasking_mode != tskm_immediate_exec) {
            // Master shouldn't call decrease_load().         // TODO: enable master threads.
            // Master should have th_may_decrease_load == 0.  // TODO: enable master threads.
            __kmp_task_team_wait(this_thr, team
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
        }
#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

# if USE_ITT_BUILD && USE_ITT_NOTIFY
        // Join barrier - report frame end
        if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
#if OMP_40_ENABLED
            this_thr->th.th_teams_microtask == NULL &&
#endif
            team->t.t_active_level == 1)
        {
            kmp_uint64 cur_time = __itt_get_timestamp();
            ident_t * loc = team->t.t_ident;
            kmp_info_t **other_threads = team->t.t_threads;
            int nproc = this_thr->th.th_team_nproc;
            int i;
            switch(__kmp_forkjoin_frames_mode) {
            case 1:
                __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                break;
            case 2:
                __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
                break;
            case 3:
                if( __itt_metadata_add_ptr ) {
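                    // Imbalance metric: the team's total wait time, computed as in __kmp_barrier() above.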
                    // Initialize with master's wait time
                    kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
                    for (i=1; i<nproc; ++i) {
                        delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
                    }
                    __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
                }
                __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                this_thr->th.th_frame_time = cur_time;
                break;
            }
        }
# endif /* USE_ITT_BUILD */
    }
#if USE_ITT_BUILD
    else {
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);
    }
#endif /* USE_ITT_BUILD */

#ifdef KMP_DEBUG
    if (KMP_MASTER_TID(tid)) {
        KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
                      gtid, team_id, tid, nproc));
    }
#endif /* KMP_DEBUG */

    // TODO now, mark worker threads as done so they may be disbanded
    KMP_MB(); // Flush all pending memory write invalidates.
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));

#if OMPT_SUPPORT
    if (ompt_enabled) {
#if OMPT_BLAME
        if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
            ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
                team->t.ompt_team_info.parallel_id,
                team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
        }
#endif

        // return to default state
        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif
}


// TODO release worker threads' fork barriers as we are ready instead of all at once
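/* The fork barrier at the start of a parallel region. The master sets up the
   task team and blocktime info and then signals the release; each worker waits
   on its b_go flag, and on wake-up picks up the new team, ICVs, and affinity
   settings (or exits early if the runtime is shutting down). */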
void
__kmp_fork_barrier(int gtid, int tid)
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier);
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
    void * itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */

    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, (team != NULL) ? team->t.t_id : -1, tid));

    // th_team pointer only valid for master thread here
    if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
            // Create itt barrier object
            itt_sync_obj  = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);  // Call acquired/releasing
        }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
        register kmp_info_t **other_threads = team->t.t_threads;
        register int i;

        // Verify state
        KMP_MB();

        for(i=1; i<team->t.t_nproc; ++i) {
            KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
                           gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                           team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                           other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
            KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
                              & ~(KMP_BARRIER_SLEEP_STATE))
                             == KMP_INIT_BARRIER_STATE);
            KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
        }
#endif

        if (__kmp_tasking_mode != tskm_immediate_exec) {
            __kmp_task_team_setup(this_thr, team, 0);  // 0 indicates setup current task team if nthreads > 1
        }

        /* The master thread may have changed its blocktime between the join barrier and the
           fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
           access it when the team struct is not guaranteed to exist. */
        // See note about the corresponding code in __kmp_join_barrier() being performance-critical
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
        }
    } // master

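    /* All threads run the release algorithm; the TRUE argument is propagate_icvs,
       which (unlike the FALSE passed by regular barriers) lets the release path
       push ICVs to the workers where the ICV-push scheme is enabled. */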
    switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
    case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
        __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
        __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    default: {
        __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
    }
    }

    // Early exit for reaping threads releasing forkjoin barrier
    if (TCR_4(__kmp_global.g.g_done)) {
        this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
            if (!KMP_MASTER_TID(tid)) {
                itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
                if (itt_sync_obj)
                    __kmp_itt_barrier_finished(gtid, itt_sync_obj);
            }
        }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
        return;
    }

    /* We can now assume that a valid team structure has been allocated by the master and
       propagated to all worker threads. The current thread, however, may not be part of the
       team, so we can't blindly assume that the team pointer is non-null.  */
    team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
    /* Master thread's copy of the ICVs was set up on the implicit taskdata in
       __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
       this data before this function is called. We cannot modify __kmp_fork_call() to look at
       the fixed ICVs in the master's thread struct, because it is not always the case that the
       threads arrays have been allocated when __kmp_fork_call() is executed. */
    {
        KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
        if (!KMP_MASTER_TID(tid)) {  // master thread already has ICVs
            // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
            KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PULL

    if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
    }

#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
    if (proc_bind == proc_bind_intel) {
#endif
#if KMP_AFFINITY_SUPPORTED
        // Call dynamic affinity settings
        if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
            __kmp_balanced_affinity(tid, team->t.t_nproc);
        }
#endif // KMP_AFFINITY_SUPPORTED
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    }
    else if (proc_bind != proc_bind_false) {
        if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
            KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                           __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
        }
        else {
            __kmp_affinity_set_place(gtid);
        }
    }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
        if (!KMP_MASTER_TID(tid)) {
            // Get correct barrier object
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            __kmp_itt_barrier_finished(gtid, itt_sync_obj);  // Workers call acquired
        } // (prepare called inside barrier_release)
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
}


void
__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_setup_icv_copy);

    KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
    KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

    /* Master thread's copy of the ICVs was set up on the implicit taskdata in
       __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
       this data before this function is called. */
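/* Three compile-time schemes for propagating the new ICVs to the workers:
   KMP_BARRIER_ICV_PULL stashes them in the master's th_fixed_icvs so each worker
   pulls its own copy in __kmp_fork_barrier(); KMP_BARRIER_ICV_PUSH lets the
   fork-barrier release algorithm push them down, so nothing is done here;
   otherwise they are copied linearly to every worker's implicit task below. */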
#if KMP_BARRIER_ICV_PULL
    /* Copy the ICVs into th_fixed_icvs in the master's thread structure (which remains
       untouched), where all of the worker threads can access them and make their own
       copies after the barrier. */
    KMP_DEBUG_ASSERT(team->t.t_threads[0]);  // The threads arrays should be allocated at this point
    copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
                  0, team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
    // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
    KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
                  0, team->t.t_threads[0], team));
#else
    // Copy the ICVs to each of the non-master threads.  This takes O(nthreads) time.
    ngo_load(new_icvs);
    KMP_DEBUG_ASSERT(team->t.t_threads[0]);  // The threads arrays should be allocated at this point
    for (int f=1; f<new_nproc; ++f) { // Skip the master thread
        // TODO: GEH - pass in better source location info since usually NULL here
        KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                      f, team->t.t_threads[f], team));
        __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
        ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
        KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                      f, team->t.t_threads[f], team));
    }
    ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}
